####################################### LINEAR REGRESSION #######################################

options(show.signif.stars=FALSE, digits=4)

###################### Roller Data Example ######################

roller <- read.table("roller.txt", header=TRUE)
head(roller, n=3)
roller
attach(roller)
y = depression
x = weight
detach(roller)
n = length(x)
bar_x = mean(x)
bar_y = mean(y)
s_xx = sum((x-bar_x)^2)                  # sum of squares of the predictor
s_xx; sd(x)^2*(n-1)                      # check the formula with 'sd'
s_xy = sum((x-bar_x)*(y-bar_y))          # sum of cross products
s_xy; cov(y,x)*(n-1)                     # check the formula with 'cov'
hat.beta_1 = s_xy/s_xx                   # least squares estimate of the slope
hat.beta_0 = bar_y - hat.beta_1*bar_x    # least squares estimate of the intercept
c(intercept=hat.beta_0, slope=hat.beta_1)
plot(x, y, xlab="", ylab="", main="")    # scatterplot
abline(a=hat.beta_0, b=hat.beta_1)       # add the regression line
title(xlab=list("Roller weight (t)", cex=1),
      ylab=list("Depression in lawn (mm)", cex=1))
mtext("Depression vs weight, roller data", side=3, line=1, cex=1.2)

###################### R function 'lm()' ######################

homedata <- read.table("homedata.txt", header=TRUE)
head(homedata, n=3)
attach(homedata)
plot(y1970, y2000)
cor(y1970, y2000)       # as the scatterplot suggests, the relation is linear (and positive)
res = lm(y2000~y1970)   # IMP - linear models - 'response ~ predictor'
coef(res)
names(res)
res$coefficients
plot(y1970, y2000); abline(res, col=2, lwd=2)
x.star = 50000                           # a given x value
coef(res)[1] + coef(res)[2]*x.star       # b0 + b1*x
t(coef(res)) %*% c(1, x.star)            # the same, using matrix algebra
predict(res, data.frame(y1970=x.star))
resid(res)                               # hat.epsilon_i
y2000 - fitted(res)                      # Y - hat.Y, i.e. the same residuals
predict(res, data.frame(y1970=55100))    # perform a prediction as above, with x=55100
lm(y2000~y1970, data=homedata)   # the model formula plus the 'data' argument offers the
                                 # flexibility to use the variable names of the data frame
                                 # without attaching it
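# An added illustration (a sketch, not in the original script): 'predict()' can
# also return a confidence interval for the mean response and a (wider)
# prediction interval for a new observation at x.star.
predict(res, data.frame(y1970=x.star), interval="confidence")
predict(res, data.frame(y1970=x.star), interval="prediction")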
###################### Outliers in the regression model - REGULARITIES IN THE VOTE ######################

florida <- read.table("florida.txt", header=TRUE)
head(florida, n=3)
attach(florida); str(florida)
# the candidates were Pat Buchanan and George Bush: is there some relationship
# between the votes for the two candidates, given that both were conservative?
plot(BUCHANAN~BUSH, data=florida)
res = lm(BUCHANAN~BUSH, data=florida)
abline(res)
with(florida, identify(BUSH, BUCHANAN, n=2, labels=County))
# identify points by clicking on them; it returns their row indexes.
# 'n=2' specifies the number of points to identify.
# 'labels=County' is optional and places the county names next to the points.
# now we identify the two outliers
County[c(13,50)]
class(County)
ycapp = predict(res, data.frame(BUSH=BUSH[c(13,50)]))
y = BUCHANAN[c(13,50)]
residui = y - ycapp
residuals(res)[c(13,50)]   # predicted amount and residual for Dade and Palm Beach

###################### Simulated data ######################

age = rep(seq(20,60,by=5), 3)   # in sports, heart rates are used for optimal training
mhr = 209 - 0.7*age + rnorm(length(age), sd=4)
plot(mhr~age, main="Age versus maximum heart rate")
res.mhr = lm(mhr~age); coef(res.mhr)
abline(res.mhr, lty=2)   # estimated regression line
c(true_intercept=209, true_slope=-0.7)
abline(a=209, b=-0.7)    # actual model
legend(40, 195, bty="n",
       legend=c("Actual model", "Estimated regression line"), lty=c(1,2))
RSS1 = sum(resid(res.mhr)^2)/(length(age)-2)   # RSS/(n-2), RSS = Residual Sum of Squares
RSS2 = deviance(res.mhr)/(length(age)-2)       # the same, via 'deviance()'

# Testing the model assumptions
help(plot.lm)
plot(res.mhr, which=4)
old.par = par(no.readonly=TRUE)
par(mfrow=c(2,2))
plot(res.mhr, which=1:4)
par(old.par)
cooks.distance(res.mhr)
summary(res.mhr)$resid
plot(res.mhr, which=4)
plot(1:length(age), cooks.distance(res.mhr), xlab="Obs. number",
     ylab="Cook's distance", type="h")
length(cooks.distance(res.mhr))
cooks = cooks.distance(res.mhr)
ind = which(cooks > 0.08)
text(ind, cooks[ind], labels=paste(ind), pos=3, cex=0.8)

###################### Assessing linearity - RESIDUAL PLOT ######################

x = rep(1:10, 4)
y = 5*sin(x) + rnorm(40, sd=1)   # the true relation between x and y is a sinusoidal trend
plot(x, y)
res = lm(y~x)
names(res)
old.par = par(no.readonly=TRUE)
par(fig=c(0, 0.5, 0.2, 0.8))
plot(y~x); abline(res)
curve(5*sin(x), lty=2, col=2, lwd=2, add=TRUE)
par(fig=c(0.5, 1, 0.2, 0.8), new=TRUE)
plot(resid(res)~fitted(res))   # the residuals should scatter around the horizontal line at zero
par(old.par)
plot(res, which=1)   # corresponds to the first plot

###################### Normality of the residuals - NORMAL QQ PLOT ######################

x = rep(1:10, 4)                # this data set is created to fulfill the normality
epsilon = rnorm(40, sd=1)       # assumption of the errors but not their independence:
y = 1 + 2*x + cumsum(epsilon)   # here the error cumulates over the observations
plot(x, y)
res = lm(y~x)
e = resid(res)              # residuals
std.e = (e-mean(e))/sd(e)   # standardized residuals
qqnorm(std.e); abline(a=0, b=1, lty=2)   # qqplot to assess normality (independence is not satisfied)
plot(res, which=2)    # corresponds to the second plot
plot(e[-40], e[-1])   # IMP - the lag plot should look like a random scatter
cor(e[-40], e[-1])    # if the errors were uncorrelated
plot(acf(e))   # autocorrelation function: the first bar is 1,
               # the second bar is cor(e[-40],e[-1]),
               # the third bar is cor(e[c(-40,-39)],e[c(-1,-2)]), and so on;
               # under independence they should all be small in absolute value
               # (all values around 0)

####################### THE ERROR TERMS MUST HAVE COMMON VARIANCE ######################

x = rep(1:10, 4)                         # predictor variable
y = 1 + (1/2)*x + rnorm(40, sd=x/10)     # the error terms have variance proportional to the
plot(x, y)                               # values of the predictor: a different sd for each point
res = lm(y~x)
plot(resid(res)~fitted(res)); abline(h=0, lty=2)   # the spread gets larger for larger values of x
e = resid(res)              # residuals
std.e = (e-mean(e))/sd(e)   # standardized residuals
plot(sqrt(abs(std.e))~fitted(res))   # the spread of the points gets larger for larger values of hat.y
plot(res, which=3)   # corresponds to the third plot
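# An added numerical companion to the spread plots (a sketch, not in the
# original script): split the residuals at the median fitted value and compare
# the two variances with an F-test; a small p-value points to non-constant variance.
grp = fitted(res) > median(fitted(res))
var.test(e[grp], e[!grp])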
emissions<-read.table("emissions.txt",header=TRUE) head(emissions, n=3) # data on CO2 emissions and per-capita gross domestic product (GDP) for several countries res = lm(CO2~perCapita, data=emissions) plot(CO2~perCapita,data=emissions) abline(res) attach(emissions) identify(perCapita,CO2,n=2, labels=rownames(emissions)) detach(emissions) cooks.distance(res) # Cook's distance, calculated as the difference of hat.y_i with and without the i-th observation # high value of Cook's distance is signal of a influential point plot(res,which=4) # diagnostic graph for influential points >1 INFLUENTIAL n = length(emissions$CO2) ####################### Statistical Inferences - CREATE A VECTOR WITH THE ESTIMATES ################# n.simu = 10; sigma0 = 5 # standard error of the residual hat.th = array(0,c(n.simu,3)) # where we store hat.beta_0, hat.beta_1, (hat.sigma)^2 x = rep(1:10,10) # predictor values plot(y~x); abline(res) for(i in 2:n.simu) { y = x+rnorm(100,sd=sigma0) # simulate response values res = lm(y~x) hat.th[i,1:2] = coef(res) # coefficients hat.th[i,3] = deviance(res)/(100-2) # s^2 = RSS/df = unbiased estimate of sigma0^2 abline(res) # add the i-th regression line } titl = expression(paste("Simulated data ",Y[i]==x[i]+epsilon[i]," , ", sigma==5)) mtext(titl,side=3,line=1,cex=1.2) old.par = par(no.readonly = TRUE) par(fig=c(0, 0.5, 0.2, 0.8)) hist(hat.th[,1],prob=TRUE) lines(density(hat.th[,1])) SE_0 = sigma0*sqrt(sum(x^2)/ (length(x)*sum((x-mean(x))^2))) # Standard Error of hat.beta_0 lines(sort(hat.th[,1]),dnorm(sort(hat.th[,1]),mean=0,sd=SE_0),lty=2,col=2,lwd=2) # theoretical distribution of hat.beta_0 par(fig=c(0.5, 1, 0.2, 0.8), new=TRUE) hist(hat.th[,2],prob=TRUE) lines(density(hat.th[,2])) SE_1 = sigma0*sqrt(1/sum((x-mean(x))^2)) lines(sort(hat.th[,2]),dnorm(sort(hat.th[,2]), mean=1,sd=SE_1),lty=2,col=2,lwd=2)# theoretical distribution of hat.beta_1 par(old.par) qqnorm(hat.th[,1],main="Normal QQplot beta_0"); qqline(hat.th[,1]) qqnorm(hat.th[,2],main="Normal QQplot beta_1")# normal Q-Q plot of hat.beta_1 ; qqline(hat.th[,2]) score = hat.th[,3]*(100-2)/sigma0^2 # s^2 chis-quare distributed (after proper rescaling) plot(qchisq(ppoints(score),100-2),sort(score), main="Chi-squared QQplot sigma.sq"); abline(0,1) # back to mhr data age = rep(seq(20,60,by=5), 3) mhr = 209-0.7*age+rnorm(length(age),sd=4) res.mhr = lm(mhr~age) summary(res.mhr) n = length(age) hat.sigma = sqrt( sum(resid(res.mhr)^2)/(n-2) ) # s^2 = RSS/df = unbiased estimate of sigma0^2 = pag18 sqrt(deviance(res.mhr)/(n-2)) # the same RSS/(n-2) c(sigma=hat.sigma) summary(res.mhr)$sigma names(summary(res.mhr)) unclass(summary(res.mhr)) # gives 'summary()' as a list # the table named 'Coefficients' reports the # least squared estimates of bets_0 and beta_1, # the corresponding SE, the t-score and p-vale for # H_0: bata_i=0 vs beta_i!=0, i=0,1 hat.beta_0 = coef(res.mhr)[1] # Inference on beta_0 summary(res.mhr) SE_0 = hat.sigma* sqrt( sum(age^2)/(n*sum( (age-mean(age))^2)) ) # SE of hat.beta_0 by formula c(Estimate=hat.beta_0,Std._Error=SE_0); t_score = (hat.beta_0-0)/SE_0 # H_0: beta_0=0 vs H_1: beta_0!=0 p_value = 2*(1-pt(abs(t_score),df=n-2)) c(t_value=t_score,p=p_value) # intercept is significantly different from zero round(hat.beta_0+qt(c(0.025,0.975),n-2)*SE_0,2) # 95% confidence interval summary(res.mhr)$coef hat.beta_1 = coef(res.mhr)[2] # Inference on beta_1 summary(res.mhr) SE_1 = hat.sigma*sqrt( 1/sum((age-mean(age))^2 )) c(Estimate=hat.beta_1,Std._Error=SE_1) t_score = (hat.beta_1-0)/SE_1 # H_0: beta_1=0 vs H_1: 
hat.beta_1 = coef(res.mhr)[2]   # inference on beta_1
summary(res.mhr)
SE_1 = hat.sigma*sqrt(1/sum((age-mean(age))^2))   # SE of hat.beta_1 by formula
c(Estimate=hat.beta_1, Std._Error=SE_1)
t_score = (hat.beta_1-0)/SE_1   # H_0: beta_1=0 vs H_1: beta_1!=0
p_value = 2*(1-pt(abs(t_score), df=n-2))
c(t_value=t_score, p=p_value)
round(hat.beta_1 + qt(c(0.025,0.975), n-2)*SE_1, 2)   # 95% confidence interval
summary(res.mhr)$coef

####################### Anova and Coefficient of Determination (OPTIONAL) ###################

r.sq = cor(age,mhr)^2   # equal to the square of the Pearson correlation coefficient
c(R_Squared=r.sq)
summary(res.mhr)$r.squared
TSS = sum((mhr-mean(mhr))^2)   # Total Sum of Squares
TSS
RSS = sum((mhr-fitted(res.mhr))^2)   # Residual Sum of Squares
RSS
ESS = Reg.SS = sum((fitted(res.mhr)-mean(mhr))^2)   # Regression (Explained) Sum of Squares
RSS; TSS-Reg.SS; deviance(res.mhr)
R2 = ESS/TSS; R2; r.sq   # R2, the Coefficient of Determination
anova(res.mhr)   # Analysis of Variance
F_score = Reg.SS/(deviance(res.mhr)/(n-2))   # F-test for H_0: beta_1=0 vs H_1: beta_1!=0,
                                             # i.e. the first row of anova(res.mhr)
t_score = (hat.beta_1-0)/SE_1   # t-test statistic for the same hypothesis test
F_score; t_score^2   # the same
1-pf(F_score, df1=1, df2=n-2)   # p-value of the F-test for
                                # H_0: beta_1=0 vs H_1: beta_1!=0;
                                # corresponds to the first row of anova(res.mhr)
2*(1-pt(abs(t_score), df=n-2))  # the same p-value given by the t-test
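# An added cross-check (a sketch, not in the original script): 'summary()'
# stores the same F statistic, with its degrees of freedom, computed by hand above.
summary(res.mhr)$fstatistic      # value, numdf, dendf
anova(res.mhr)[1, "F value"]     # the same value read off the ANOVA table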